#!/bin/bash
#SBATCH --partition=a6000
#SBATCH --job-name=ovss_train
#SBATCH --output=slurm_logs/slurm_%x_%j.out
#SBATCH --error=slurm_logs/slurm_%x_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=12

# Launch DDP training under SLURM: a single srun task starts torchrun, which is
# asked to spawn one worker process per GPU allocated on the node.

# Stop on the first failing command, on unset variables, and on failed
# pipelines, so a broken setup never falls through to the GPU launch.
set -euo pipefail

readonly WORK_DIR=/space/liangc/code/ovss/clip_dinoiser_919
# NOTE(review): this config lives under clip_dinoiser/, not clip_dinoiser_919/
# (the working directory) — confirm the cross-tree reference is intentional.
readonly CONFIG_FILE=/space/liangc/code/ovss/clip_dinoiser/configs/slurm_config/dinoclip_150_1+_1w_loss+_layer3-11_p_448_Tmax30.yaml

# These variables are only read here; fail with a clear message if the script
# is executed without them instead of passing empty flags to torchrun.
: "${SLURM_NNODES:?not set - submit this script with sbatch}"
: "${SLURM_GPUS_PER_NODE:?not set - submit this script with sbatch}"
: "${SLURM_JOB_ID:?not set - submit this script with sbatch}"
: "${SLURM_NODELIST:?not set - submit this script with sbatch}"

# Switch to the working directory; train_custom_ddp.py is resolved relative to
# it, so continuing after a failed cd would run from the wrong location.
cd -- "$WORK_DIR" || {
  printf 'error: cannot cd to %s\n' "$WORK_DIR" >&2
  exit 1
}

# Disable wandb-core (optional).
export WANDB_USE_CORE=False

# Expand the node list and keep its first line as the rendezvous host.
# Done as a plain assignment (no pipe) so a scontrol failure aborts the job
# rather than silently yielding an empty host name.
node_hosts=$(scontrol show hostnames "$SLURM_NODELIST")
readonly RDZV_HOST=${node_hosts%%$'\n'*}
if [[ -z "$RDZV_HOST" ]]; then
  printf 'error: could not resolve a host from SLURM_NODELIST=%s\n' \
    "$SLURM_NODELIST" >&2
  exit 1
fi

# Start training.
# NOTE(review): the endpoint uses port 0; presumably this relies on the
# rendezvous host choosing a free port, which is only expected to work while
# the job stays on one node — pin a fixed port if --nodes is ever raised.
srun torchrun \
  --nnodes="$SLURM_NNODES" \
  --nproc_per_node="$SLURM_GPUS_PER_NODE" \
  --rdzv_id="$SLURM_JOB_ID" \
  --rdzv_backend=c10d \
  --rdzv_endpoint="${RDZV_HOST}:0" \
  train_custom_ddp.py \
  "$CONFIG_FILE"